Dataset Handling

The VQA dataset contains ~120K images and ~330K question/answer pairs, which is far more than this toy VQA project needs. We therefore extract a small subset of it for training and testing our model.


In [1]:
# a bit of setup as usual
import h5py, json

import numpy as np
import cPickle as pickle

from IPython.display import display, Image

0 - Loading the full VQA dataset


In [2]:
# Image feature arrays (VGG features, judging by the file names). h5py hands
# back lazy dataset handles, so the HDF5 files are deliberately left open:
# later cells read rows from these handles on demand.
# NOTE: Group.get() returns None for a missing dataset instead of raising, so
# a wrong key only surfaces when the handle is first indexed.
h5_img_file_train = h5py.File('data/vqa_data_img_vgg_train.h5', 'r')
fv_im_train = h5_img_file_train.get('/images_train') # 82460 x 14 x 14 x 512

h5_img_file_test = h5py.File('data/vqa_data_img_vgg_test.h5', 'r')
fv_im_test = h5_img_file_test.get('/images_test') # 40504 x 14 x 14 x 512

# Preprocessed questions/answers for both splits live in a single HDF5 file.
h5_ques_file = h5py.File('data/vqa_data_prepro.h5', 'r')
ques_train = h5_ques_file.get('/ques_train') # 215375 x 26
ques_len_train = h5_ques_file.get('/ques_len_train') # 215375 x 1
img_pos_train = h5_ques_file.get('/img_pos_train') # 215375 x 1
ques_id_train = h5_ques_file.get('/ques_id_train') # 215375 x 1
answers = h5_ques_file.get('/answers') # 215375 x 1
split_train = h5_ques_file.get('/split_train') # 215375 x 1

ques_test = h5_ques_file.get('/ques_test') # 121512 x26
ques_len_test = h5_ques_file.get('/ques_len_test')
img_pos_test = h5_ques_file.get('/img_pos_test')
ques_id_test = h5_ques_file.get('/ques_id_test')
split_test = h5_ques_file.get('/split_test')
ans_test = h5_ques_file.get('/ans_test')

# Index -> token lookup tables. The JSON is fully parsed here, so the file
# handle is closed right away instead of being left to the garbage collector.
with open('data/vqa_data_prepro.json', 'r') as fp:
    json_file = json.load(fp)
ix_to_word = json_file['ix_to_word']
ix_to_ans = json_file['ix_to_ans']

vocab_size = len(ix_to_word) # 12604

1 - Extracting a small dataset for training & testing


In [3]:
# Small training subset: the first 3 * num_samples questions plus the image
# features they reference.
num_samples = 8000
num_questions = num_samples * 3

# One bulk HDF5 read per field instead of one read per question.
ques_sub = ques_train[:num_questions]
ques_len_sub = ques_len_train[:num_questions]
img_pos_sub = img_pos_train[:num_questions]
ans_sub = answers[:num_questions]

qa_data_train_small = []
train_im_small_idx = []  # local image index -> row in fv_im_train
pos_to_local = {}        # row in fv_im_train -> local image index

# The training questions do not come in exact triples per image (the loading
# cell notes 215375 questions for 82460 images), so the local image index is
# derived from img_pos_train rather than from ix / 3. As a consequence the
# number of extracted images is whatever these questions reference and may
# differ from num_samples.
for ix in xrange(num_questions):
    im_pos = int(img_pos_sub[ix])
    if im_pos not in pos_to_local:
        pos_to_local[im_pos] = len(train_im_small_idx)
        train_im_small_idx.append(im_pos)
    qa_data_train_small.append((ques_sub[ix], ques_len_sub[ix], pos_to_local[im_pos], ans_sub[ix]))

# Pull the referenced feature rows, in local-index order.
train_im_small = [fv_im_train[im_ix, :] for im_ix in train_im_small_idx]

# Each pickled entry is (question, question_length, local_image_index, answer).
with open('data/qa_data_train_small.pkl', 'wb') as fp:
    pickle.dump(qa_data_train_small, fp)

with h5py.File('data/vqa_data_img_vgg_train_small.h5', 'w') as hf:
    hf.create_dataset('images_train', data=train_im_small)

In [6]:
# Small test subset: the first 3 * num_samples questions plus the image
# features they reference.
num_samples = 4000
num_questions = num_samples * 3

# One bulk HDF5 read per field instead of one read per question.
ques_sub = ques_test[:num_questions]
ques_len_sub = ques_len_test[:num_questions]
img_pos_sub = img_pos_test[:num_questions]
ans_sub = ans_test[:num_questions]

qa_data_test_small = []
test_im_small_idx = []  # local image index -> row in fv_im_test
pos_to_local = {}       # row in fv_im_test -> local image index

# The local image index is derived from img_pos_test instead of assuming
# three consecutive questions per image (ix / 3). The counts noted in the
# loading cell (121512 questions, 40504 images) are consistent with triples,
# and when the triples are contiguous this yields exactly the same indices.
for ix in xrange(num_questions):
    im_pos = int(img_pos_sub[ix])
    if im_pos not in pos_to_local:
        pos_to_local[im_pos] = len(test_im_small_idx)
        test_im_small_idx.append(im_pos)
    qa_data_test_small.append((ques_sub[ix], ques_len_sub[ix], pos_to_local[im_pos], ans_sub[ix]))

# Pull the referenced feature rows, in local-index order.
test_im_small = [fv_im_test[im_ix, :] for im_ix in test_im_small_idx]

# Each pickled entry is (question, question_length, local_image_index, answer).
with open('data/qa_data_test_small.pkl', 'wb') as fp:
    pickle.dump(qa_data_test_small, fp)

with h5py.File('data/vqa_data_img_vgg_test_small.h5', 'w') as hf:
    hf.create_dataset('images_test', data=test_im_small)

2 - Extracting a tiny dataset for debugging


In [7]:
# Tiny training subset for debugging: the first 3 * num_samples questions plus
# the image features they reference.
# NOTE: this cell reuses the *_small variable names, so after it runs they
# hold the tiny subset, not the small one.
num_samples = 100
num_questions = num_samples * 3

# One bulk HDF5 read per field instead of one read per question.
ques_sub = ques_train[:num_questions]
ques_len_sub = ques_len_train[:num_questions]
img_pos_sub = img_pos_train[:num_questions]
ans_sub = answers[:num_questions]

qa_data_train_small = []
train_im_small_idx = []  # local image index -> row in fv_im_train
pos_to_local = {}        # row in fv_im_train -> local image index

# The training questions do not come in exact triples per image (the loading
# cell notes 215375 questions for 82460 images), so the local image index is
# derived from img_pos_train rather than from ix / 3. The number of extracted
# images may therefore differ from num_samples.
for ix in xrange(num_questions):
    im_pos = int(img_pos_sub[ix])
    if im_pos not in pos_to_local:
        pos_to_local[im_pos] = len(train_im_small_idx)
        train_im_small_idx.append(im_pos)
    qa_data_train_small.append((ques_sub[ix], ques_len_sub[ix], pos_to_local[im_pos], ans_sub[ix]))

# Pull the referenced feature rows, in local-index order.
train_im_small = [fv_im_train[im_ix, :] for im_ix in train_im_small_idx]

# Each pickled entry is (question, question_length, local_image_index, answer).
with open('data/qa_data_train_tiny.pkl', 'wb') as fp:
    pickle.dump(qa_data_train_small, fp)

with h5py.File('data/vqa_data_img_vgg_train_tiny.h5', 'w') as hf:
    hf.create_dataset('images_train', data=train_im_small)

In [8]:
# Tiny test subset for debugging: the first 3 * num_samples questions plus the
# image features they reference.
# NOTE: this cell reuses the *_small variable names, so after it runs they
# hold the tiny subset, not the small one.
num_samples = 100
num_questions = num_samples * 3

# One bulk HDF5 read per field instead of one read per question.
ques_sub = ques_test[:num_questions]
ques_len_sub = ques_len_test[:num_questions]
img_pos_sub = img_pos_test[:num_questions]
ans_sub = ans_test[:num_questions]

qa_data_test_small = []
test_im_small_idx = []  # local image index -> row in fv_im_test
pos_to_local = {}       # row in fv_im_test -> local image index

# The local image index is derived from img_pos_test instead of assuming
# three consecutive questions per image (ix / 3). When the questions really
# are stored as contiguous triples this yields exactly the same indices.
for ix in xrange(num_questions):
    im_pos = int(img_pos_sub[ix])
    if im_pos not in pos_to_local:
        pos_to_local[im_pos] = len(test_im_small_idx)
        test_im_small_idx.append(im_pos)
    qa_data_test_small.append((ques_sub[ix], ques_len_sub[ix], pos_to_local[im_pos], ans_sub[ix]))

# Pull the referenced feature rows, in local-index order.
test_im_small = [fv_im_test[im_ix, :] for im_ix in test_im_small_idx]

# Each pickled entry is (question, question_length, local_image_index, answer).
with open('data/qa_data_test_tiny.pkl', 'wb') as fp:
    pickle.dump(qa_data_test_small, fp)

with h5py.File('data/vqa_data_img_vgg_test_tiny.h5', 'w') as hf:
    hf.create_dataset('images_test', data=test_im_small)

In [10]:
# Sanity check: turn the first stored test sample back into readable text.
# Indices missing from the lookup tables are rendered as 'UNK'.
ques, ques_len, im_ix, ans = zip(*qa_data_test_small)
first_question_words = [ix_to_word.get(str(token), 'UNK') for token in ques[0]]
first_answer = ix_to_ans.get(str(ans[0]), 'UNK')
# Printed fields: decoded question, local image index, row in the full
# feature file for question 0, decoded answer.
print('%s %s %s %s' % (first_question_words, im_ix[0], img_pos_test[0], first_answer))


[u'what', u'is', u'the', u'table', u'made', u'of', u'?', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK'] 0 20858 wood

In [11]:
# Parse the raw test annotations, closing the file as soon as it is read.
with open('data/vqa_raw_test.json', 'r') as fp:
    raw_test_json = json.load(fp)

# For each of the first ten groups of three consecutive entries, print the
# question / ground-truth pairs and then show data/imgs/<group number>.jpg
# (group numbers start at 1).
# NOTE(review): this presumes each group of three entries belongs to the
# displayed image - confirm against how data/imgs was generated.
for i in xrange(0, 30, 3):
    for j in xrange(i, i + 3):
        entry = raw_test_json[j]
        print("Q: %s - GT: %s" % (entry['question'], entry['ans']))
    display(Image(filename='data/imgs/%d.jpg' % (i // 3 + 1)))


Q: What is the table made of? - GT: wood
Q: Is the food napping on the table? - GT: no
Q: What has been upcycled to make lights? - GT: kettles
Q: Is this an Spanish town? - GT: no
Q: Are there shadows on the sidewalk? - GT: yes
Q: What is in the top right corner? - GT: tree
Q: Is it cold outside? - GT: yes
Q: What is leaning against the house? - GT: ladder
Q: How many windows can you see? - GT: 1
Q: Is this in a park? - GT: yes
Q: Is there a bicycle in this picture? - GT: yes
Q: Is the person feeding the birds? - GT: no
Q: Is this a Girl Scout? - GT: yes
Q: What uniform is she wearing? - GT: girl scout
Q: What color is the fence? - GT: white
Q: What color is the linoleum? - GT: blue
Q: Is the water running in the sink? - GT: no
Q: How is the floor made? - GT: tile
Q: What is the teddy bear sitting on? - GT: stool
Q: Do children like this object? - GT: yes
Q: What is written on the teddy bear's feet? - GT: my 1st tatty teddy
Q: Is the weather warm in this picture? - GT: no
Q: How many people are in this photo? - GT: 2
Q: Why would the snowmobiler be riding up the mountain for the skier? - GT: rescue
Q: How many people can the red buses hold? - GT: 120
Q: Are the red buses identical? - GT: no
Q: How many double-decker buses are in the picture? - GT: 3
Q: Where is this picture? - GT: market
Q: How many fruits and veggies are there? - GT: lot
Q: How many boxes of green apples are there? - GT: 4